/**************************************************************************
* This macro pulls pension records form 5500 data for 2000-2008
/*************************************************************************/

%macro pullplans_0008(yyyy,yy);

	/**************************************************************************
	* Step 1: Input the research file for year=yy
	***************************************************************************/
	/*the scale macro is to input dates correctly across files*/
	%if 20&yy. < 2007 %then %do;
		%let scale = /*redacted*/;
	%end;
	%else %do;
		%let scale = /*redacted*/;
	%end;

	data res&yy
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			large
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			soleplan
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			source_id);
		set red.bul&yy (drop = DB DC);	/*remove pre-coded DB DC flags*/
		where best_for_plan = /*redacted*/;	/*keep best filing only (no duplicates)*/
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;		/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		if /*redacted*/ OR 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/

		%if 20&yy. < 2007 %then %do;
			scale = /*redacted*/;
		%end;
		%else %do;
			scale = /*redacted*/;
		%end;

		plan_qtr_strt = qtr(form_plan_year_begin_date/&scale);	/*plan calendar start qtr*/
		plan_year_strt = year(form_plan_year_begin_date/&scale);	/*plan calendar start year*/
		plan_qtr_end = qtr(form_tax_prd/&scale);			/*plan calendar end qtr*/
		plan_year_end = year(form_tax_prd/&scale);			/*plan calendar end year*/
		form_year = 20&yy;						/*form year*/
		filing_id_char = /*redacted*/;					/*string filing id*/
		source_id = /*redacted*/;        				/*id for research file*/
		plan_eff_year = year(eff_date);					/*year that the plan started*/
		if type_plan_entity_ind^=/*redacted*/ then delete;		/*keep only single employer plans*/
		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 		/*remove welfare plans*/
	run;

	/**************************************************************************
	* Step 2: Identify the set of records that are NOT in the research file 
	* but ARE in the raw file
	***************************************************************************/
	
	/*Input the raw file*/
	data raw&yy;
		set green.f5500fileyears_2000_2014;
		where fileyear=20&yy;
	run;
	
	proc sql;
		create table xclud&yy as
		select a.filing_id as res_id, b.*
		from res&yy as a right join
		raw&yy as b
		on a.filing_id = b.filing_id;
	quit;
	
	/*Obs in the raw file that are not in the research file*/
	data xclud&yy;
		set xclud&yy;
		if res_id^=. then delete; /*records not in the research file*/
	run;
	
	/**************************************************************************
	* Step 3: clean the xclud plans file and make variable names consistent
	***************************************************************************/
	
	data xclud&yy
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			large
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			t					/*this is used to de-duplicate*/
		);												
		set xclud&yy;
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;				/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		plan_name = upcase(plan_name);
		if /*redacted*/ or 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/
	
		form_plan_year_begin_date = left(trim(form_plan_year_begin_date));
		fpy_begin_yr = /*redacted*/; 	/*plan start year*/
		fpy_begin_mth = /*redacted*/; 	/*plan start month*/
		fpy_begin_day = /*redacted*/;	/*plan start day*/
		fpy_date = mdy(fpy_begin_mth,fpy_begin_day,fpy_begin_yr);		/*plan start sas date*/
		plan_qtr_strt = qtr(fpy_date);						/*plan calendar start qtr*/
		plan_year_strt = year(fpy_date);					/*plan calendar start year*/
	
		form_tax_prd = left(trim(form_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/
	
		form_year = fileyear;						/*rename for consistency with research file*/
		filing_id_char = /*redacted*/;				        /*string filing id*/
		adj_active = tot_active_partcp_cnt;				/*no editing for the raw file*/
		large = /*redacted*/; 							/*all the plans in this set are small*/
		opr_ein = left(trim(spons_dfe_ein));				/*rename for consistency with research file*/
		opr_pn = left(trim(spons_dfe_pn));				/*rename for consistency with research file*/
		len_pn = length(opr_pn);					/*length of pn field*/
		if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		/*Redacted: removing invalid EIN/PNs*/
		plan_eff_year = /*redacted*/;		/*year that the plan started*/	
		type_plan_entity_ind = left(trim(type_plan_entity_ind));
		if type_plan_entity_ind^=/*redacted*/ then delete;			/*keep only single employer plans*/s	
		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 			/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;				/*all plans are now pension plans*/
		t = 1;								/*helper used for de-duplication step*/
	run;
	
	/*Set variable lengths to enable error-free appending with the research file*/
	data xclud&yy;
	
		/*redacted*/
		
		set xclud&yy;
	run;
		
	/**************************************************************************
	* Step 4: De-duplicate the file (pick the max filing id for each EIN-PN)
	***************************************************************************/
	
	proc sort data = xclud&yy;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = xclud&yy;
		by opr_ein opr_pn plan_year_end;
		var filing_id;
		output out = best_filing 
		max(filing_id) = max_filing_id;
	run;
	
	proc sql;
		create table xclud&yy._clean as
		select a.*, b.max_filing_id
		from xclud&yy as a inner join
		best_filing as b
		on a.filing_id = b.max_filing_id;
	quit;
	
	data  xclud&yy._clean;
		set xclud&yy._clean (drop= t max_filing_id);
		source_id = /*redacted*/; /*id for raw file*/
	run;
	
	/**************************************************************************
	* Step 5: Stack plans from the two files together
	***************************************************************************/
	
	data stack&yy;
		set res&yy xclud&yy._clean;
	run;
	
	/*
	  De-duplicate in case of repeated entries
	  research file entry gets priority when dups occur
	*/
	
	proc sort data = stack&yy;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = stack&yy;
		by opr_ein opr_pn plan_year_end;
		var source_id;
		output out = deduper
		min(source_id) = min_source_id;
	run;
	
	proc sql;
		create table pens&yy as
		select a.*, b.min_source_id, b.opr_ein, b.opr_pn
		from stack&yy as a left join
		deduper as b
		on a.opr_ein = b.opr_ein & a.opr_pn = b.opr_pn & a.plan_year_end = b.plan_year_end;
	quit;

	data pens&yy;
		set pens&yy;
		if source_id^=min_source_id then delete;
	run;

	/**************************************************************************
	* Step 6: Count up # DB plans per EIN
	***************************************************************************/
	
	proc sort data = pens&yy;
		by opr_ein;
	run;
	
	proc means noprint data = pens&yy;
		by opr_ein;
		var DB;				
		output out = plan_count&yy
			sum(DB) = num_DB_plans;
	run;
	
	/*Check how many participants are in firms with 2+ DB plans*/
	proc sql;
		create table pens&yy._plncnt as 
		select *
		from pens&yy as t,
		plan_count&yy as m
		where t.opr_ein = m.opr_ein;
	quit;
		
	data blue.pensplan_samp&yy (drop =
		xDB
		min_source_id
		soleplan
		_freq_
		_type_);
		set pens&yy._plncnt;
		adj_DB_partcp = DB*adj_active;
		DB_partcp = DB*tot_active_partcp_cnt;
	run;
	
	proc freq data = blue.pensplan_samp&yy;
	  tables DB*DC;
	run;

	/**************************************************************************
	* Step 7: Pull schedule H/I data (employer and employee contributions)
	***************************************************************************/
	
	/*Input schedule H raw data*/
	PROC IMPORT OUT= WORK.schh&yyyy 
		    DATAFILE= "/.../F5500/f5500/F_SCH_H_&yyyy..csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schh&yyyy  (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schh&yyyy (rename = (emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		participant_contrib_amt=employee_contrib_income_amt_n
		tot_income_amt = tot_income_amt_n));
		filing_id_char = /*redacted*/;
		opr_ein = left(trim(sch_h_ein));
		opr_pn = left(trim(sch_h_pn));
		schh_tax_prd = left(trim(sch_h_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 			/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/
	run;

	/*Input schedule I raw data*/
	PROC IMPORT OUT= WORK.schi&yyyy 
		    DATAFILE= "/.../F5500/f5500/F_SCH_I_&yyyy..csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schi&yyyy (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schi&yyyy (rename = (small_emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		small_participant_contrib_amt=employee_contrib_income_amt_n
		small_tot_income_amt = tot_income_amt_n));
		filing_id_char = /*redacted*/;
		opr_ein = left(trim(sch_i_ein));
		opr_pn = left(trim(sch_i_plan_num));
		schi_tax_prd = left(trim(sch_i_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/		
	run;

	/*Append the files*/
	data blue.schhi&yyyy;
	      set schh&yyyy schi&yyyy;
	run;
	
	/*De-duplicate H and I schedules*/
	proc sort data = blue.schhi&yyyy nodupkey;
	      by filing_id_char;
	run;	

	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;

%mend pullplans_0008;
